library(haven)
library(foreign)
library(readxl)
library(openxlsx)
library(car)
library(dplyr)
library(readstata13)



########## Import survey data ##########
d <- read.dta13("~/PhD/Survey Data for Opinion_Policy Dataset/DONE Opinion, 1998, februar/MMA0500_F1.dta", encoding="UTF-8")
names(d)

######################################################################
#--------------------------  VARIABLES ------------------------------# 
######################################################################

# INSERT row number of issue in excell docuemnt (next number after previous entry)
issue_row_num = 1

# INSERT a policy item variable (Values = FAV, OPP, DK) and the name of that variable 
# Recode according to rules specified in separate document
d$polpref <- car::recode(d$NAME, " ' '='FAV'; ' '='OPP'; ' '='DK'")
polpref_var_name <- "NAME"

# INSERT Gender variable (1=MALE, 2=FEMALE)
d$gender <- car::recode(d$kjonn, "'Menn'='MALE'; 'Kvinner'='FEMALE'; else=NA")
table(d$gender)

# INSERT Age group variable (categorical, X number of groups, ordered youngest to oldest (1,2,...X), watch out for DK)
# d$age <- factor(d$alder_oevre, labels=c("1","2","3","4","5","6","7","8","9","10","11","12"))
# table(d$age)

# INSERT Numeric AGE variable. Turns it into 10 equal-N categories
d$age <- 1998-d$fodt
d$age[d$age==14] <- NA
d %>% 
  group_by(age = ntile(age, 10)) %>% # split variable into k groups
  mutate(age_cats = paste0(min(age), "-", max(age))) %>% # create the ranges
  ungroup() -> d
table(d$age)

# INSERT Eduation group variable (categorical, X number of groups, ordered lowest to highest (1,2,...X), watch out for DK) 
d$edu <- factor(d$videreutd, labels=c("1","2","3","4"))
table(d$edu)

# INSERT Numeric HOUSHOLD Income variable. Turns it into 10 equal-N categories
d$hincome <- d$husinnt
d %>% 
  group_by(hincome = ntile(hincome, 10)) %>% # split variable into k groups
  mutate(hincome_cats = paste0(min(hincome), "-", max(hincome))) %>% # create the ranges
  ungroup() -> d
table(d$hincome)


#d$income <- d$respinnt
#d %>% 
#  group_by(income = ntile(income, 10)) %>% # split variable into k groups
#  mutate(income_cats = paste0(min(income), "-", max(income))) %>% # create the ranges
#  ungroup() -> d
#table(d$income)


# INSERT Region variable
d$region <- car::recode(d$region, "'Oslo-Akershus'='Oslo og Akershus'; 'Oslo-akershus'='Oslo og Akershus'; 
                        '?vrige ?stland'='?vrige ?stland'; 
                        'Nord-norge'='Nord-Norge';")


# OR 
# INSERT Fylke variable, NB: HEDMARK, ?ST-AGDER
#d$fylke <- car::recode(d$fylke, "'Aust-Agder'='?st-Agder'; 'Hedemark'='Hedmark'")
#table(d$fylke)

# INSERT Party variable (What will they vote if election tomorrow) (Cateogrical, no order, but use same abbreviatons as in master data)
d$party <- car::recode(d$valgimorgen,"
                       'Arbeiderpartiet'='AP'; 'Det norske Arbeiderparti'='AP'; 'Ap'='AP'; 
                       'H?yre'='H'; 'H0yre'='H'; 
                       'AKP (m-l)'='AKP'; 'AKP (Arbeidernes Kommunistparti (m-l)'='AKP'; 
                       'Kommunistene (NKP)'='NKP'; 'NKP'='NKP'; 'Nkp'='NKP'; 
                       'Kristelig Folkeparti'='KRF'; 'KrF'='KRF'; 'Krf'='KRF';
                       'Senterpartiet'='SP'; 'Sp'='SP'; 
                       'Sosialistisk Folkeparti'='SF'; 'Sosialistisk Folkeparti eller Sosialistisk Valgforbund'='SF'; 
                       'Sosialistisk Venstreparti'='SV'; 'SV'='SV'; 'Sv'='SV'; 
                       'Venstre'='V'; 'Det nye Folkeparti'='DNF'; 'Det nye Folkepartiet'='DNF'; 
                       'DLF'='DLF'; 'Dlf'='DLF'; 
                       'Anders Langes Parti'='ALP';
                       'Fremskrittspartiet'='FRP'; 'FrP'='FRP'; 'Frp'='FRP'; 
                       'Kystpartiet'='KY'; 
                       'R?d Valgallianse'='RV'; 'RV'='RV'; 'Rv'='RV'; 
                       'Hadde ikke stemmerett'='NOVOTE'; 'Hadde ikke stemmerett'='NOVOTE'; 'Hadde ikke stemmeret'='NOVOTE'; 'Stemte ikke'='NOVOTE'; 
                       else=NA")  

table(d$party)


######################################################################
#---------------------------- START ---------------------------------# 
######################################################################

# Age distribution 
age_dist <- as.data.frame(table(d$age, d$polpref))
age_dist$name <- paste("AGE", age_dist$Var1, sep ="")
age_dist$name <- paste(age_dist$name, age_dist$Var2, sep = "_")
age_dist <- age_dist %>% select(name, Freq)

# Education distribution
edu_dist <- as.data.frame(table(d$edu, d$polpref))
edu_dist$name <- paste("EDU", edu_dist$Var1, sep ="")
edu_dist$name <- paste(edu_dist$name, edu_dist$Var2, sep = "_")
edu_dist <- edu_dist %>% select(name, Freq)

# Income HOUSHOLD distribution
hinc_dist <- as.data.frame(table(d$hincome, d$polpref))
hinc_dist$name <- paste("HINC", hinc_dist$Var1, sep ="")
hinc_dist$name <- paste(hinc_dist$name, hinc_dist$Var2, sep = "_")
hinc_dist <- hinc_dist %>% select(name, Freq)

# Party distribution
p_dist <- as.data.frame(table(d$party, d$polpref))
p_dist$name <- paste("P", p_dist$Var1, p_dist$Var2, sep ="_")
p_dist <- p_dist %>% select(name, Freq)

# Region distribution
rg_dist <- as.data.frame(table(d$region, d$polpref))
rg_dist$name <- paste("RG", rg_dist$Var1, rg_dist$Var2, sep ="_")
rg_dist <- rg_dist %>% select(name, Freq)

# Fylke distribution
#fy_dist <- as.data.frame(table(d$fylke, d$polpref))
#fy_dist$name <- paste("FY", fy_dist$Var1, fy_dist$Var2, sep ="_")
#fy_dist <- fy_dist %>% select(name, Freq)

# Gender
g_dist <- as.data.frame(table(d$gender, d$polpref))
g_dist$name <- paste(g_dist$Var1, g_dist$Var2, sep ="_")
g_dist <- g_dist %>% select(name, Freq)

# Overall
o_dist <- as.data.frame(table(d$polpref))
o_dist$name <- paste("OVERALL", o_dist$Var1, sep ="_")
o_dist <- o_dist %>% select(name, Freq)

###### Single variable distributions: 
# Single Age distribution 
age_dist_s <- as.data.frame(table(d$age))
age_dist_s$name <- paste("S_AGE", age_dist_s$Var1, sep ="")
age_dist_s <- age_dist_s %>% select(name, Freq)

# Single Education distribution
edu_dist_s <- as.data.frame(table(d$edu))
edu_dist_s$name <- paste("S_EDU", edu_dist_s$Var1, sep ="")
edu_dist_s <- edu_dist_s %>% select(name, Freq)

# Single Income HOUSHOLD distribution
hinc_dist_s <- as.data.frame(table(d$hincome))
hinc_dist_s$name <- paste("S_HINC", hinc_dist_s$Var1, sep ="")
hinc_dist_s <- hinc_dist_s %>% select(name, Freq)

# Rbind the data frames
m1 <- rbind(age_dist, edu_dist, hinc_dist,  p_dist, age_dist_s, edu_dist_s, hinc_dist_s, rg_dist, o_dist, g_dist) 
m2 <- data.frame(t(m1))
colnames(m2) <- as.character(unlist(m2[1,]))
m2 = m2[-1, ]
m2$ISSUE <- issue_row_num
m2 <- mutate_all(m2, function(x) as.numeric(as.character(x)))

m2$ORG_VARNAME <- polpref_var_name
m2$ORG_DATASET_N <- nrow(d)

# Rbind with master data 
master_data <- read_excel("~/PhD/Dataset/PhD Dataset okt 3.xlsx")
rbind.all.columns <- function(x, y) {
  x.diff <- setdiff(colnames(x), colnames(y))
  y.diff <- setdiff(colnames(y), colnames(x))
  
  x[, c(as.character(y.diff))] <- NA
  
  y[, c(as.character(x.diff))] <- NA
  
  return(rbind(x, y))
}
master_data <- rbind.all.columns(master_data, m2)
master_data <- master_data %>% arrange(ISSUE)
write.xlsx(master_data, '~/PhD/Dataset/PhD Dataset okt 3.xlsx')

table(d$polpref,d$fylke)
table(d$polpref,d$age)
table(d$polpref,d$gender)
table(d$polpref,d$income)
table(d$polpref,d$edu)

######################################################################
#----------------------------- END ----------------------------------# 
######################################################################
